(Author: Zifan Wang)
Before diving into the the multivariable analyses, we first wish to get an overall view of how delay times are distributed across different regions. We will use interactive maps to describe the delay patterns in different US states, cities, and by different flight routes.
Sys.setenv("plotly_username"="ziwang970")
Sys.setenv("plotly_api_key"="Rh542AcijT2qJ07JZsQY")
# read in dataset
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df <- read.csv("C:/Users/ziwan/Desktop/2018 Fall Courses/BST 260/Project/dataset/flight2017.csv")
In this step, we will describe the average delay times of each state:
# calculate mean departure delay minutes by state
state_delay <- df %>%
group_by(ORIGIN_STATE_ABR) %>%
summarize(mean_delay = mean(DEP_DELAY_NEW, na.rm = TRUE))
# give state boundaries white borders
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
# make the plot
p <- plot_geo(state_delay, locationmode = 'USA-states') %>%
add_trace(
z = ~mean_delay, locations = ~ORIGIN_STATE_ABR,
color = ~mean_delay, colors = 'Purples'
) %>%
colorbar(title = "Departure delay in minutes") %>%
layout(
title = '2017 average departure delay (minutes) by states',
geo = g
)
p
From the plot, we see that in general, the Northeast region of the US had experienced longer delay times in 2017 (States like Maine or Vermont had average delay times over 20 minutes). For other regions, there seems to be relatively long delay times in the South and the West coast.
We then look at the delay time patterns for each departure city: The delay times are categorized into 4 quartiles and shown by colored bubbles, and the size of the bubbles depicts the length of delay time:
# calculate mean departure delay minutes by city
city_delay <- df %>%
group_by(ORIGIN_CITY_NAME) %>%
summarize(mean_delay = mean(DEP_DELAY_NEW, na.rm = TRUE))
library(splitstackshape)
city_delay <- cSplit(city_delay, "ORIGIN_CITY_NAME", sep=",")
city_delay <- city_delay %>% mutate(name = ORIGIN_CITY_NAME_1)
# add the coordination of cities
coordinate <- read.csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_us_cities.csv')
city_delay <- city_delay %>% mutate(name = trimws(as.character(name)))
coordinate <- coordinate %>% mutate(name = trimws(as.character(name)))
merged_city_delay <- left_join(city_delay,coordinate, by='name')
merged_city_delay <- merged_city_delay %>%
group_by(name) %>%
summarize(mean_delay = mean(mean_delay, na.rm = TRUE), lat = mean(lat), lon = mean(lon))
# draw the plot by cities
merged_city_delay$q <- with(merged_city_delay, cut(mean_delay, quantile(mean_delay)))
levels(merged_city_delay$q) <- paste(c("1st", "2nd", "3rd", "4th", "5th"), "Quantile")
merged_city_delay$q <- as.ordered((merged_city_delay$q))
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showland = TRUE,
landcolor = toRGB("gray85"),
subunitwidth = 1,
countrywidth = 1,
subunitcolor = toRGB("white"),
countrycolor = toRGB("white")
)
p <- plot_geo(merged_city_delay, locationmode = 'USA-states', sizes = c(1, 250)) %>%
add_markers(
x = ~lon, y = ~lat, size = ~mean_delay, color = ~q, hoverinfo = "text",
text = ~paste(merged_city_delay$name, "<br />", merged_city_delay$mean_delay, "minutes")
) %>%
layout(title = '2017 average departure delay (minutes) by city', geo = g)
p
## Warning: Ignoring 102 observations
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
From the plot, we see that similar to the plot by states, cities in the Northeast, South and the West coast are more likely to have delay times at the highest (yellow) or second highest (green) quartiles, with some cities (e.g. St. Augustine in Florida) reaching average delays of more than 60 minutes. Cities with the shortest average delay times are generally in the Midwest area.
Next, we look at the flight routes with delays: we will display the routes with an average delay time of 15+, 30+, 60+, and 90+ minutes in 2017:
# group by flight routes and calculate mean departure delay
route_delay <- df %>%
group_by(ORIGIN_CITY_NAME, DEST_CITY_NAME) %>%
summarize(mean_delay = mean(DEP_DELAY_NEW, na.rm = TRUE))
library(splitstackshape)
route_delay <- cSplit(route_delay, "ORIGIN_CITY_NAME", sep=",")
route_delay <- cSplit(route_delay, "DEST_CITY_NAME", sep=",")
route_delay <- route_delay %>% mutate(name1 = ORIGIN_CITY_NAME_1, name2 = DEST_CITY_NAME_1)
# add the coordination of cities
coordinate <- read.csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_us_cities.csv')
route_delay <- route_delay %>% mutate(name1 = trimws(as.character(name1)), name2 = trimws(as.character(name2)))
coordinate <- coordinate %>% mutate(name = trimws(as.character(name)))
merged_1 <- left_join(route_delay,coordinate, by = c("name1" = "name")) %>%
rename(lat1 = lat, lon1 = lon, pop1 = pop) %>%
select(mean_delay, name1, name2, pop1, lat1, lon1)
merged_2 <- left_join(route_delay,coordinate, by = c("name2" = "name")) %>%
rename(lat2 = lat, lon2 = lon, pop2 = pop) %>%
select(mean_delay, name1, name2, pop2, lat2, lon2)
merged_route_delay <- left_join(merged_1, merged_2, by = c("name1", "name2")) %>%
rename(mean_delay = mean_delay.x) %>%
select(mean_delay, name1, name2, pop1, lat1, lon1, pop2, lat2, lon2)
merged_route_delay <- merged_route_delay %>% # get the mean population for each city
group_by(name1, name2) %>%
summarize(mean_delay = mean(mean_delay, na.rm = TRUE),
pop1 = mean(pop1, na.rm = TRUE), pop2 = mean(pop2, na.rm = TRUE),
lat1 = mean(lat1, na.rm = TRUE), lon1 = mean(lon1, na.rm = TRUE),
lat2 = mean(lat2, na.rm = TRUE), lon2 = mean(lon2, na.rm = TRUE))
# map projection
# restrict to >15, >30, >60, >90 minutes of delay
delay15 <-merged_route_delay %>%
filter(mean_delay >= 15)
delay30 <-merged_route_delay %>%
filter(mean_delay >= 30)
delay60 <-merged_route_delay %>%
filter(mean_delay >= 60)
delay90 <-merged_route_delay %>%
filter(mean_delay >= 90)
geo <- list(
scope = 'north america',
projection = list(type = 'azimuthal equal area'),
showland = TRUE,
landcolor = toRGB("gray95"),
countrycolor = toRGB("gray80")
)
p1 <- plot_geo(locationmode = 'USA-states', color = I("red")) %>%
add_markers(
data = delay15, x = ~lon1, y = ~lat1, text = ~name1,
size = ~pop1, hoverinfo = "text", alpha = 0.5
) %>%
add_markers(
data = delay15, x = ~lon2, y = ~lat2, text = ~name2,
size = ~pop2, hoverinfo = "text", alpha = 0.5
) %>%
add_segments(
x = ~lon1, xend = ~lon2,
y = ~lat1, yend = ~lat2,
alpha = 0.3, size = I(1), hoverinfo = "none"
) %>%
layout(
title = '2017 flight routes with >15 min delay',
geo = geo, showlegend = FALSE)
p2 <- plot_geo(locationmode = 'USA-states', color = I("red")) %>%
add_markers(
data = delay30, x = ~lon1, y = ~lat1, text = ~name1,
size = ~pop1, hoverinfo = "text", alpha = 0.5
) %>%
add_markers(
data = delay30, x = ~lon2, y = ~lat2, text = ~name2,
size = ~pop2, hoverinfo = "text", alpha = 0.5
) %>%
add_segments(
x = ~lon1, xend = ~lon2,
y = ~lat1, yend = ~lat2,
alpha = 0.3, size = I(1), hoverinfo = "none"
) %>%
layout(
title = '2017 flight routes with >30 min delay',
geo = geo, showlegend = FALSE)
p3 <- plot_geo(locationmode = 'USA-states', color = I("red")) %>%
add_markers(
data = delay60, x = ~lon1, y = ~lat1, text = ~name1,
size = ~pop1, hoverinfo = "text", alpha = 0.5
) %>%
add_markers(
data = delay60, x = ~lon2, y = ~lat2, text = ~name2,
size = ~pop2, hoverinfo = "text", alpha = 0.5
) %>%
add_segments(
x = ~lon1, xend = ~lon2,
y = ~lat1, yend = ~lat2,
alpha = 0.3, size = I(1), hoverinfo = "none"
) %>%
layout(
title = '2017 flight routes with >60 min delay',
geo = geo, showlegend = FALSE )
p4 <- plot_geo(locationmode = 'USA-states', color = I("red")) %>%
add_markers(
data = delay90, x = ~lon1, y = ~lat1, text = ~name1,
size = ~pop1, hoverinfo = "text", alpha = 0.5
) %>%
add_markers(
data = delay90, x = ~lon2, y = ~lat2, text = ~name2,
size = ~pop2, hoverinfo = "text", alpha = 0.5
) %>%
add_segments(
x = ~lon1, xend = ~lon2,
y = ~lat1, yend = ~lat2,
alpha = 0.3, size = I(1), hoverinfo = "none"
) %>%
layout(
title = '2017 flight routes with >90 min delay',
geo = geo, showlegend = FALSE )
p <- subplot(p1, p2, p3, p4, nrows = 2) %>%
layout(title = "2017 flight routes with different delay times",
xaxis = list(domain=list(x=c(0,0.5),y=c(0,0.5))),
scene = list(domain=list(x=c(0.5,1),y=c(0,0.5))),
xaxis2 = list(domain=list(x=c(0.5,1),y=c(0.5,1))),
annotations = list(
list(x = 0.2 , y = 1, text = ">15 mins", showarrow = F, xref='paper', yref='paper'),
list(x = 0.8 , y = 1, text = ">30 mins", showarrow = F, xref='paper', yref='paper'),
list(x = 0.2 , y = 0.5, text = ">60 mins", showarrow = F, xref='paper', yref='paper'),
list(x = 0.8 , y = 0.5, text = ">90 mins", showarrow = F, xref='paper', yref='paper'))
)
## Warning: Ignoring 262 observations
## Warning: Ignoring 244 observations
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: Ignoring 50 observations
## Warning: Ignoring 48 observations
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: Ignoring 13 observations
## Warning: Ignoring 22 observations
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: Ignoring 2 observations
## Warning: Ignoring 9 observations
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
p
We can see from the plot that as the threshold for delays increases, the number of routes with the corresponding delay time decreases. There have been many routes with average delay times of 15+ minutes in 2017, but only very few of them had an average delay of more than 60 or 90 minutes (e.g. the route between New York and San Antonio).